⚙️

Chips Analysis - Task 2

#modify datatype 
df1['DATE'] =pd.to_datetime(df1['DATE'])


#extract Year and Month number as String - YYYYMM format
df1['month_id'] = df1['DATE'].dt.strftime('%Y%m')

#define metrics:  For each store and month 
#  total sales, number of customers,Transactions,Total Quantity
grouped = df1.groupby(['STORE_NBR','month_id'])

metrics = grouped.agg(
    totSales = ('TOT_SALES','sum'),
    nCustomers = ('LYLTY_CARD_NBR', 'nunique'),
    nTranactions=('TXN_ID','nunique'),
    totalqty = ('PROD_QTY','sum')

).reset_index()

#transactions per customer
#chips per customer
#Average Price per unit
metrics['nTxnPerCust'] = metrics['nTranactions'] / metrics['nCustomers']
metrics['nChipsPerTxn'] = metrics['totalqty'] / metrics['nTranactions']
metrics['avgPricePerUnit'] = metrics['totSales'] / metrics['totalqty']

#month_id to integer
metrics['month_id'] = metrics['month_id'].astype(int)

#stores with 12 full months
store_month_counts = metrics.groupby('STORE_NBR')['month_id'].nunique()
stores_with_full_obs = store_month_counts[store_month_counts == 12].index.tolist()

#Pre-trial period (< 2019 02)
pre_trial_measures = metrics[
    (metrics['month_id'] < 201902) &
    (metrics['STORE_NBR'].isin(stores_with_full_obs))

# Function to calculate correlation for a measure.
# Looping through each control store.
 
def calculate_correlation(input_df, metric_col, store_comparison):
    # Get all other stores (control candidates)
    store_numbers = input_df['STORE_NBR'].unique()
    store_numbers = [store for store in store_numbers if store != store_comparison]
    
    corr_results = []

    for store in store_numbers:
        # Filter data for each store
        trial_data = input_df[input_df['STORE_NBR'] == store_comparison][['month_id', metric_col]]
        control_data = input_df[input_df['STORE_NBR'] == store][['month_id', metric_col]]

        # Merge on month_id
        merged = pd.merge(trial_data, control_data, on='month_id', suffixes=('_trial', '_control'))

        # Calculate correlation
        corr = merged[f'{metric_col}_trial'].corr(merged[f'{metric_col}_control'])

        # Save result
        corr_results.append({
            'Store1': store_comparison,
            'Store2': store,
            'corr_measure': corr
        })
    
    return pd.DataFrame(corr_results)
    
    
    
 correlation_table = calculate_correlation(metrics, 'totSales', 77)
 print(correlation_table.head())

#A standardised metric based on the
#absolute difference between the trial store's performance 
#and each control store's performance.
def calculate_magnitude_distance(input_df, metric_col, store_comparison):
    """
    input_df: DataFrame with columns ['STORE_NBR', 'month_id', <metric_col>]
    metric_col: str, name of the metric column to compare (e.g. 'totSales')
    store_comparison: int, the trial store number
    """
    
    control_stores = input_df['STORE_NBR'].unique()
    control_stores = [s for s in control_stores if s != store_comparison]

    result_rows = []

    for store in control_stores:
        # Filter data for each store
        trial_data = input_df[input_df['STORE_NBR'] == store_comparison][['month_id', metric_col]]
        control_data = input_df[input_df['STORE_NBR'] == store][['month_id', metric_col]]

        # Merge on month_id
        merged = pd.merge(trial_data, control_data, on='month_id', suffixes=('_trial', '_control'))

        # Calculate absolute difference
        merged['measure'] = abs(merged[f'{metric_col}_trial'] - merged[f'{metric_col}_control'])

        # Add store numbers to result
        merged['Store1'] = store_comparison
        merged['Store2'] = store

        # Only keep necessary columns
        result_rows.append(merged[['Store1', 'Store2', 'month_id', 'measure']])

    # Combine all results
    calc_dist_df = pd.concat(result_rows, ignore_index=True)

    return calc_dist_df
trial_store = 77

corr_nSales = calculate_correlation(metrics, 'totSales', trial_store)
corr_nCustomers = calculate_correlation(metrics, 'nCustomers', trial_store)